Import Libraries

library(tidyverse)
library(pROC)
Type 'citation("pROC")' for a citation.

Attaching package: ‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var
library(sparklyr)

Helper Functions

#' ROC curve code
#'
#' Based on algo 1 page 866, Fawcett2005
#'
#' @param L observations 
#' @param f, predicted prob.
#'
#' @return points in ROC space and score
get_roc <- function(L, f) {
  # Calculate P and N
  P <- sum(L==1)
  N <- sum(L==0)
  # Order the observations by prediction
  df  <- tibble(L, f)
  df <- df %>% arrange(desc(f))
  # Set TP and FP to zero
  TP <- 0
  FP <- 0
  # Set up matrix for results
  R <- NULL
  # Set previous f
  f_prev <- -Inf
  # set counter
  i <- 1
  while(i <= length(df$L)){
    if( df$f[i] != f_prev){
      R <- rbind(R, c(FP/N, TP/P, df$f[i]))
      f_prev <- df$f[i]
    }
    if(df$L[i] == 1){
      TP <- TP + 1
    } else {
      FP <- FP + 1
    }
    i <- i + 1
  }
  R <- rbind(R, c(FP/N, TP/P, f_prev))
  R <- data.frame(R)
  colnames(R) <- c("FPR","TPR", "Score")
  return(R)
}

Connect to Spark

sc <- spark_connect(master = "local")
* Using Spark: 3.1.1
  1. Read in the reddit dataset correctly and provide evidence of your code.
# read the dataset
reddit <- read.csv("RS_2017-09_filtered70.csv")
reddit <- as_tibble(reddit)
reddit
# data selection
reddit.selection <- select(reddit, brand_safe, num_crossposts, over_18, is_self, is_reddit_media_domain, is_video, stickied, spoiler)

# convert the brand_safe variable to 1 and 0
reddit.selection <- reddit.selection %>% 
  mutate(brand_safe=as.logical(as.logical(brand_safe))) %>%
  mutate(over_18=as.logical(as.logical(over_18))) %>%
  mutate(is_self=as.logical(as.logical(is_self))) %>%
  mutate(is_reddit_media_domain=as.logical(as.logical(is_reddit_media_domain))) %>%
  mutate(is_video=as.logical(as.logical(is_video))) %>%
  mutate(stickied=as.logical(as.logical(stickied))) %>%
  mutate(spoiler=as.logical(as.logical(spoiler)))

# data summary
summary(reddit.selection)
 brand_safe      num_crossposts       over_18         is_self        is_reddit_media_domain  is_video      
 Mode :logical   Min.   :0.0000000   Mode :logical   Mode :logical   Mode :logical          Mode :logical  
 FALSE:65141     1st Qu.:0.0000000   FALSE:127226    FALSE:78627     FALSE:122255           FALSE:139379   
 TRUE :74682     Median :0.0000000   TRUE :12597     TRUE :61196     TRUE :17568            TRUE :444      
                 Mean   :0.0007509                                                                         
                 3rd Qu.:0.0000000                                                                         
                 Max.   :3.0000000                                                                         
  stickied        spoiler       
 Mode :logical   Mode :logical  
 FALSE:139724    FALSE:138983   
 TRUE :99        TRUE :840      
                                
                                
                                
  1. Build a classifier based on the six factors and provide the classifier information. Provide the correct analysis about the factors.
# copy the dataset to spark
reddit_tbl <- copy_to(sc, reddit.selection, "reddit", overwrite=TRUE)

# split the data in training and testing data sets
partitions <- reddit_tbl %>%
  na.omit() %>%
  sdf_random_split(training=0.7, test=0.3, seed=42)
* No rows dropped by 'na.omit' call
train_tbl <- partitions$train
test_tbl <- partitions$test
# select the factors to include in the model
factors <- c("over_18", "is_self", "is_reddit_media_domain", "is_video", "stickied", "spoiler")
formula <- as.formula(paste("brand_safe ~ ", paste(factors, collapse="+")))
test_model <- function(title, model, formula, data) {
  # build the model
  ml_model <- data %>%
    model(formula)

  # perform the predictions
  predictions <- ml_predict(ml_model, data)

  # print the AUC
  print(paste(
    title,
    ml_binary_classification_evaluator(predictions, metric_name = "areaUnderROC")))
}

test_model("ml_logistic_regression", ml_logistic_regression, formula, train_tbl)
[1] "ml_logistic_regression 0.700856558338718"
test_model("ml_random_forest", ml_random_forest, formula, train_tbl)
[1] "ml_random_forest 0.700699574974018"
test_model("ml_gradient_boosted_trees", ml_gradient_boosted_trees, formula, train_tbl)
[1] "ml_gradient_boosted_trees 0.701015105927723"
# create the model
ml_model <- train_tbl %>%
  ml_gradient_boosted_trees(formula)

# perform the predictions
predictions <- ml_predict(ml_model, train_tbl)

head(predictions)
  predictions$p1 <- unlist(pred_lr$probability)[ c(FALSE,TRUE) ]
Error in unlist(pred_lr$probability) : object 'pred_lr' not found
#ml_log <- train_tbl %>%
  #ml_random_forest(formula)
  #ml_gradient_boosted_trees(formula)
  #ml_logistic_regression(formula)

#pred.train <- ml_predict(ml_log, train_tbl)
#pred.test <- ml_predict(ml_log, test_tbl)

#ml_binary_classification_evaluator(pred.train, metric_name = "areaUnderROC")
#ml_binary_classification_evaluator(pred.test, metric_name = "areaUnderROC")
# perform the predictions
#pred_lr <- pred.train %>% collect
#pred_lr$p1 <- unlist(pred_lr$probability)[ c(FALSE,TRUE) ]

# calculate the roc values
#ROC_lr <- get_roc(L = pred_lr$late_arrival, f = pred_lr$p1)
  
# plot the roc curve
#ggplot(ROC_lr, aes(x = FPR, y = TPR)) + geom_line(aes(col = "Model Prediction"))
LS0tCnRpdGxlOiAiUHJvamVjdCBUYXNrIDIgLSBzcGFya2x5ciIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyBJbXBvcnQgTGlicmFyaWVzCgpgYGB7cn0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkocFJPQykKbGlicmFyeShzcGFya2x5cikKYGBgCiMgSGVscGVyIEZ1bmN0aW9ucwoKYGBge3J9CiMnIFJPQyBjdXJ2ZSBjb2RlCiMnCiMnIEJhc2VkIG9uIGFsZ28gMSBwYWdlIDg2NiwgRmF3Y2V0dDIwMDUKIycKIycgQHBhcmFtIEwgb2JzZXJ2YXRpb25zIAojJyBAcGFyYW0gZiwgcHJlZGljdGVkIHByb2IuCiMnCiMnIEByZXR1cm4gcG9pbnRzIGluIFJPQyBzcGFjZSBhbmQgc2NvcmUKZ2V0X3JvYyA8LSBmdW5jdGlvbihMLCBmKSB7CiAgIyBDYWxjdWxhdGUgUCBhbmQgTgogIFAgPC0gc3VtKEw9PTEpCiAgTiA8LSBzdW0oTD09MCkKICAjIE9yZGVyIHRoZSBvYnNlcnZhdGlvbnMgYnkgcHJlZGljdGlvbgogIGRmICA8LSB0aWJibGUoTCwgZikKICBkZiA8LSBkZiAlPiUgYXJyYW5nZShkZXNjKGYpKQogICMgU2V0IFRQIGFuZCBGUCB0byB6ZXJvCiAgVFAgPC0gMAogIEZQIDwtIDAKICAjIFNldCB1cCBtYXRyaXggZm9yIHJlc3VsdHMKICBSIDwtIE5VTEwKICAjIFNldCBwcmV2aW91cyBmCiAgZl9wcmV2IDwtIC1JbmYKICAjIHNldCBjb3VudGVyCiAgaSA8LSAxCiAgd2hpbGUoaSA8PSBsZW5ndGgoZGYkTCkpewogICAgaWYoIGRmJGZbaV0gIT0gZl9wcmV2KXsKICAgICAgUiA8LSByYmluZChSLCBjKEZQL04sIFRQL1AsIGRmJGZbaV0pKQogICAgICBmX3ByZXYgPC0gZGYkZltpXQogICAgfQogICAgaWYoZGYkTFtpXSA9PSAxKXsKICAgICAgVFAgPC0gVFAgKyAxCiAgICB9IGVsc2UgewogICAgICBGUCA8LSBGUCArIDEKICAgIH0KICAgIGkgPC0gaSArIDEKICB9CiAgUiA8LSByYmluZChSLCBjKEZQL04sIFRQL1AsIGZfcHJldikpCiAgUiA8LSBkYXRhLmZyYW1lKFIpCiAgY29sbmFtZXMoUikgPC0gYygiRlBSIiwiVFBSIiwgIlNjb3JlIikKICByZXR1cm4oUikKfQpgYGAKCgojIENvbm5lY3QgdG8gU3BhcmsKCmBgYHtyfQpzYyA8LSBzcGFya19jb25uZWN0KG1hc3RlciA9ICJsb2NhbCIpCmBgYAoKMS4JUmVhZCBpbiB0aGUgcmVkZGl0IGRhdGFzZXQgY29ycmVjdGx5IGFuZCBwcm92aWRlIGV2aWRlbmNlIG9mIHlvdXIgY29kZS4gCgpgYGB7cn0KIyByZWFkIHRoZSBkYXRhc2V0CnJlZGRpdCA8LSByZWFkLmNzdigiUlNfMjAxNy0wOV9maWx0ZXJlZDcwLmNzdiIpCnJlZGRpdCA8LSBhc190aWJibGUocmVkZGl0KQpyZWRkaXQKYGBgCgpgYGB7cn0KIyBkYXRhIHNlbGVjdGlvbgpyZWRkaXQuc2VsZWN0aW9uIDwtIHNlbGVjdChyZWRkaXQsIGJyYW5kX3NhZmUsIG51bV9jcm9zc3Bvc3RzLCBvdmVyXzE4LCBpc19zZWxmLCBpc19yZWRkaXRfbWVkaWFfZG9tYWluLCBpc192aWRlbywgc3RpY2tpZWQsIHNwb2lsZXIpCgojIGNvbnZlcnQgdGhlIGJyYW5kX3NhZmUgdmFyaWFibGUgdG8gMSBhbmQgMApyZWRkaXQuc2VsZWN0aW9uIDwtIHJlZGRpdC5zZWxlY3Rpb24gJT4lIAogIG11dGF0ZShicmFuZF9zYWZlPWFzLmxvZ2ljYWwoYXMubG9naWNhbChicmFuZF9zYWZlKSkpICU+JQogIG11dGF0ZShvdmVyXzE4PWFzLmxvZ2ljYWwoYXMubG9naWNhbChvdmVyXzE4KSkpICU+JQogIG11dGF0ZShpc19zZWxmPWFzLmxvZ2ljYWwoYXMubG9naWNhbChpc19zZWxmKSkpICU+JQogIG11dGF0ZShpc19yZWRkaXRfbWVkaWFfZG9tYWluPWFzLmxvZ2ljYWwoYXMubG9naWNhbChpc19yZWRkaXRfbWVkaWFfZG9tYWluKSkpICU+JQogIG11dGF0ZShpc192aWRlbz1hcy5sb2dpY2FsKGFzLmxvZ2ljYWwoaXNfdmlkZW8pKSkgJT4lCiAgbXV0YXRlKHN0aWNraWVkPWFzLmxvZ2ljYWwoYXMubG9naWNhbChzdGlja2llZCkpKSAlPiUKICBtdXRhdGUoc3BvaWxlcj1hcy5sb2dpY2FsKGFzLmxvZ2ljYWwoc3BvaWxlcikpKQoKIyBkYXRhIHN1bW1hcnkKc3VtbWFyeShyZWRkaXQuc2VsZWN0aW9uKQpgYGAKMi4JQnVpbGQgYSBjbGFzc2lmaWVyIGJhc2VkIG9uIHRoZSBzaXggZmFjdG9ycyBhbmQgcHJvdmlkZSB0aGUgY2xhc3NpZmllciBpbmZvcm1hdGlvbi4gUHJvdmlkZSB0aGUgY29ycmVjdCBhbmFseXNpcyBhYm91dCB0aGUgZmFjdG9ycy4gCgoKYGBge3J9CiMgY29weSB0aGUgZGF0YXNldCB0byBzcGFyawpyZWRkaXRfdGJsIDwtIGNvcHlfdG8oc2MsIHJlZGRpdC5zZWxlY3Rpb24sICJyZWRkaXQiLCBvdmVyd3JpdGU9VFJVRSkKCiMgc3BsaXQgdGhlIGRhdGEgaW4gdHJhaW5pbmcgYW5kIHRlc3RpbmcgZGF0YSBzZXRzCnBhcnRpdGlvbnMgPC0gcmVkZGl0X3RibCAlPiUKICBuYS5vbWl0KCkgJT4lCiAgc2RmX3JhbmRvbV9zcGxpdCh0cmFpbmluZz0wLjcsIHRlc3Q9MC4zLCBzZWVkPTQyKQoKdHJhaW5fdGJsIDwtIHBhcnRpdGlvbnMkdHJhaW4KdGVzdF90YmwgPC0gcGFydGl0aW9ucyR0ZXN0CmBgYApgYGB7cn0KIyBzZWxlY3QgdGhlIGZhY3RvcnMgdG8gaW5jbHVkZSBpbiB0aGUgbW9kZWwKZmFjdG9ycyA8LSBjKCJvdmVyXzE4IiwgImlzX3NlbGYiLCAiaXNfcmVkZGl0X21lZGlhX2RvbWFpbiIsICJpc192aWRlbyIsICJzdGlja2llZCIsICJzcG9pbGVyIikKZm9ybXVsYSA8LSBhcy5mb3JtdWxhKHBhc3RlKCJicmFuZF9zYWZlIH4gIiwgcGFzdGUoZmFjdG9ycywgY29sbGFwc2U9IisiKSkpCmBgYAoKYGBge3J9CnRlc3RfbW9kZWwgPC0gZnVuY3Rpb24odGl0bGUsIG1vZGVsLCBmb3JtdWxhLCBkYXRhKSB7CiAgIyBidWlsZCB0aGUgbW9kZWwKICBtbF9tb2RlbCA8LSBkYXRhICU+JQogICAgbW9kZWwoZm9ybXVsYSkKCiAgIyBwZXJmb3JtIHRoZSBwcmVkaWN0aW9ucwogIHByZWRpY3Rpb25zIDwtIG1sX3ByZWRpY3QobWxfbW9kZWwsIGRhdGEpCgogICMgcHJpbnQgdGhlIEFVQwogIHByaW50KHBhc3RlKAogICAgdGl0bGUsCiAgICBtbF9iaW5hcnlfY2xhc3NpZmljYXRpb25fZXZhbHVhdG9yKHByZWRpY3Rpb25zLCBtZXRyaWNfbmFtZSA9ICJhcmVhVW5kZXJST0MiKSkpCn0KCnRlc3RfbW9kZWwoIm1sX2xvZ2lzdGljX3JlZ3Jlc3Npb24iLCBtbF9sb2dpc3RpY19yZWdyZXNzaW9uLCBmb3JtdWxhLCB0cmFpbl90YmwpCnRlc3RfbW9kZWwoIm1sX3JhbmRvbV9mb3Jlc3QiLCBtbF9yYW5kb21fZm9yZXN0LCBmb3JtdWxhLCB0cmFpbl90YmwpCnRlc3RfbW9kZWwoIm1sX2dyYWRpZW50X2Jvb3N0ZWRfdHJlZXMiLCBtbF9ncmFkaWVudF9ib29zdGVkX3RyZWVzLCBmb3JtdWxhLCB0cmFpbl90YmwpCmBgYApgYGB7cn0KIyBjcmVhdGUgdGhlIG1vZGVsCm1sX21vZGVsIDwtIHRyYWluX3RibCAlPiUKICBtbF9ncmFkaWVudF9ib29zdGVkX3RyZWVzKGZvcm11bGEpCgojIHBlcmZvcm0gdGhlIHByZWRpY3Rpb25zCnByZWRpY3Rpb25zIDwtIG1sX3ByZWRpY3QobWxfbW9kZWwsIHRyYWluX3RibCkKCmhlYWQocHJlZGljdGlvbnMpCmBgYAoKYGBge3J9CiAgcHJlZGljdGlvbnMkcDEgPC0gdW5saXN0KHByZWRpY3Rpb25zJHByb2JhYmlsaXR5KVsgYyhGQUxTRSxUUlVFKSBdCiAgCiAgIyBjYWxjdWxhdGUgdGhlIHJvYyB2YWx1ZXMKICAjcmV0dXJuIChnZXRfcm9jKEwgPSBwcmVkX2xyJGxhdGVfYXJyaXZhbCwgZiA9IHByZWRfbHIkcDEpKQpgYGAKCgpgYGB7cn0KI21sX2xvZyA8LSB0cmFpbl90YmwgJT4lCiAgI21sX3JhbmRvbV9mb3Jlc3QoZm9ybXVsYSkKICAjbWxfZ3JhZGllbnRfYm9vc3RlZF90cmVlcyhmb3JtdWxhKQogICNtbF9sb2dpc3RpY19yZWdyZXNzaW9uKGZvcm11bGEpCgojcHJlZC50cmFpbiA8LSBtbF9wcmVkaWN0KG1sX2xvZywgdHJhaW5fdGJsKQojcHJlZC50ZXN0IDwtIG1sX3ByZWRpY3QobWxfbG9nLCB0ZXN0X3RibCkKCiNtbF9iaW5hcnlfY2xhc3NpZmljYXRpb25fZXZhbHVhdG9yKHByZWQudHJhaW4sIG1ldHJpY19uYW1lID0gImFyZWFVbmRlclJPQyIpCiNtbF9iaW5hcnlfY2xhc3NpZmljYXRpb25fZXZhbHVhdG9yKHByZWQudGVzdCwgbWV0cmljX25hbWUgPSAiYXJlYVVuZGVyUk9DIikKYGBgCmBgYHtyfQojIHBlcmZvcm0gdGhlIHByZWRpY3Rpb25zCiNwcmVkX2xyIDwtIHByZWQudHJhaW4gJT4lIGNvbGxlY3QKI3ByZWRfbHIkcDEgPC0gdW5saXN0KHByZWRfbHIkcHJvYmFiaWxpdHkpWyBjKEZBTFNFLFRSVUUpIF0KCiMgY2FsY3VsYXRlIHRoZSByb2MgdmFsdWVzCiNST0NfbHIgPC0gZ2V0X3JvYyhMID0gcHJlZF9sciRsYXRlX2Fycml2YWwsIGYgPSBwcmVkX2xyJHAxKQogIAojIHBsb3QgdGhlIHJvYyBjdXJ2ZQojZ2dwbG90KFJPQ19sciwgYWVzKHggPSBGUFIsIHkgPSBUUFIpKSArIGdlb21fbGluZShhZXMoY29sID0gIk1vZGVsIFByZWRpY3Rpb24iKSkKYGBgCgo=